
# this script generates quantities mentioned in the paper outside of tables/figures

source('read_dataset.R')
source('utils.R')

# analysis dataset stacked:

# One lazily opened dataset per election cycle, each tagged with its cycle year.
df12 = mutate(open_dataset('df12.parquet'), cycle = 2012)
df16 = mutate(open_dataset('df16.parquet'), cycle = 2016)
df20 = mutate(open_dataset('df20.parquet'), cycle = 2020)

# Stack the three cycles in order (union_all keeps duplicate rows).
df = Reduce(union_all, list(df12, df16, df20))


# Observation totals ------------------------------------------------------

# Individual-by-cycle rows in the stacked analysis dataset.
df %>%
  tally() %>%
  collect() # 232,627,552

# Distinct `component` ids: unique property owners who are registered to vote.
df %>%
  distinct(component) %>%
  count() %>%
  collect() # 108,388,166

# Distinct `component` ids among rows in the top 0.1 (nat_quant == '0.999').
df %>%
  filter(nat_quant == '0.999') %>%
  distinct(component) %>%
  tally() %>%
  collect() # 146,711


# Wealth measure L2 corr, 2020 --------------------------------------------

# 2020 rows flagged in both in_l2 and in_cl, with both wealth measures
# non-missing; only the two measures are pulled into memory.
xx = open_dataset('data/final_long/cycle=2020') %>%
  filter(in_l2 == 1, in_cl == 1, !is.na(home_val), !is.na(total)) %>%
  select(home_val, total) %>%
  collect()

# Pearson correlation between home_val and total, all rows.
cor(xx$home_val, xx$total, use = 'complete.obs', method = 'pearson') ## 0.453182

# Same correlation, restricted to rows with total below 5e6.
with(
  filter(xx, total < 5e6),
  cor(home_val, total, use = 'complete.obs', method = 'pearson')
) ## 0.6207089


# Trump 2020 dollar %s from groups ----------------------------------------

# Each query below computes, within the 2020 data, one group's share of
# summed r_val among rows with r_val > 0 and a known value of the grouping
# variable. (r_val is presumably Trump contribution dollars, per the section
# title -- confirm against the dataset codebook.)

# Share of dollars from men (gender == 'M').
df20 %>%
  filter(!is.na(gender), r_val > 0) %>%
  group_by(gender) %>%
  summarise(tot = sum(r_val)) %>%
  transmute(gender, tot = tot / sum(tot)) %>%
  filter(gender == 'M') %>%
  collect() ## 0.667

# Share of dollars from ethnicity == 'W'.
df20 %>%
  filter(!is.na(ethnicity), r_val > 0) %>%
  group_by(ethnicity) %>%
  summarise(tot = sum(r_val)) %>%
  transmute(ethnicity, tot = tot / sum(tot)) %>%
  filter(ethnicity == 'W') %>%
  collect() ## 0.889

# Share of dollars from rows with ethnicity 'W' and education 'Bach' or
# 'Grad'. Rows with education 'Unkn' are excluded from the denominator.
df20 %>%
  filter(education != 'Unkn', !is.na(ethnicity), r_val > 0) %>%
  group_by(college_w = education %in% c('Bach', 'Grad') & ethnicity == 'W') %>%
  summarise(tot = sum(r_val)) %>%
  transmute(college_w, tot = tot / sum(tot)) %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned
  filter(college_w == TRUE) %>%
  collect() ## 0.616


# Appendix A.2 -- fastLink comparison: ------------------------------------

# Inputs for the fastLink comparison: the 2016 California L2 file and the
# 2016 California (FIPS 6) CL records, both pulled fully into memory.
# NOTE(review): the `[...]` subsetting further down is data.table syntax, so
# collect() is assumed to return data.tables here -- confirm in utils.R.
l2_ca16 = open_dataset('data/l2/year=2016/state_file=ca') %>%
  filter(address_type == 'r') %>% # residential
  collect()

cl_ca16 = open_dataset('data/cl_in') %>%
  filter(fips_st == 6, vintage == 2016, address_type == 's') %>% # parcel addrs
  collect()

# Harmonise name columns so both tables expose `first` and `last`
# (setnames renames in place).
setnames(l2_ca16, c('first_m', 'last_sf'), c('first', 'last'))
setnames(cl_ca16, 'first_m', 'first')

# Fields fastLink compares between the two tables.
match_on = c('first', 'last', 'address')

# sample ZIPs: draw 20 at random from ZIPs with more than 1000 L2 rows.
# The seed and the exact call sequence determine the ZIPs listed below, so
# do not reorder these statements.
# NOTE: the result is stored in a variable named `sample`, which shadows
# base::sample as a value; calls like sample(...) still resolve to the
# function because R skips non-function bindings when looking up a call.
set.seed(0)
sample = l2_ca16[, .N, zip][N > 1000][sample(.N, 20), ]$zip
## 95032 92653 91706 94303 95338 90039 90049 94619 95829 94577
## 93650 95953 96145 92121 95388 92119 94549 96093 92392 92104

# Fit one fastLink model per sampled ZIP, linking L2 rows to CL rows within
# that ZIP. Every field in `match_on` is compared by string distance with
# partial matches allowed. `outs[[i]]` is the fit for `sample[i]`.
# seq_along() keeps this a no-op (not a 1:0 loop) if `sample` is ever empty.
outs = lapply(seq_along(sample), \(i) {
  this_zip = sample[i]

  fastLink(
    l2_ca16[zip == this_zip],
    cl_ca16[zip == this_zip],
    varnames = match_on,
    stringdist.match = match_on,
    partial.match = match_on,
    verbose = TRUE
  )
})

# Extract matched pairs for each ZIP at a 0.85 posterior threshold.
# combine.dfs = FALSE keeps the two sides as separate data frames, which the
# column selection further down relies on. The inputs must be the same
# subsets, in the same order, that produced outs[[i]].
matches = lapply(seq_along(sample), \(i) {
  this_zip = sample[i]

  getMatches(
    l2_ca16[zip == this_zip],
    cl_ca16[zip == this_zip],
    outs[[i]],
    threshold.match = 0.85,
    combine.dfs = FALSE
  )
})

# Flatten the per-ZIP match results into one table. For each ZIP, x[[1]] and
# x[[2]] are the two data frames returned by getMatches(); presumably the
# matched L2 and CL rows respectively -- confirm against fastLink docs.
# NOTE(review): columns are picked by position (5:12, 2:7, 56:59), which
# breaks silently if the input schemas change. The code below needs
# gamma.1-gamma.3 and posterior to be among them; consider selecting by name.
matches = bind_rows(lapply(matches, \(x)
  bind_cols(
    x[[1]][, 5:12],
    x[[2]][, 2:7],
    x[[1]][, 56:59]
  )))

# Randomly permute the rows (presumably so manual inspection is not ordered
# by ZIP). sample.int(n) draws the same permutation as sample(n, n) and
# avoids confusion with the global vector named `sample`.
matches = matches[sample.int(nrow(matches)), ]

# Mean posterior match probability and pair count for each agreement
# pattern across the three compared fields, lowest probability first.
matches %>%
  group_by(gamma.1, gamma.2, gamma.3) %>%
  summarise(avg_prob = mean(posterior, na.rm = TRUE), n = n()) %>%
  arrange(avg_prob)
##  first with N >> 0 is (0, 2, 2) with avg_prob 0.903

# Pairs with pattern (0, 2, 2), listed for manual inspection.
filter(matches, gamma.1 == 0, gamma.2 == 2, gamma.3 == 2)
## on inspection, almost entirely false positives (based on first names); N = 8225

nrow(matches) ## 109,669

# Match counts/rates and error rates pooled across the per-ZIP fits.
summary(aggregateEM(outs))

#                 95%       85%     75%   Exact
# 1 Match Count  101900  109685  168681   86981
# 2  Match Rate 44.251% 47.289% 69.737% 37.799%
# 3         FDR  0.072%  0.791%  7.813%        
# 4         FNR 44.375% 40.556% 12.337%   
